Kapitel 6.2: Zentralität – Vektordistanzen¶

Das Notebook ergänzt Kapitel 6.2 'Zentralität'.

Import¶

In [1]:
import pandas as pd
import plotly.express as px
from tqdm.notebook import tqdm
from resources_geschichtslyrik import *

from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import RobustScaler
from scipy.spatial import distance
from scipy.stats import entropy
In [2]:
# Load the corpus metadata (one record per text).
meta = pd.read_json(r"../resources/meta.json")
In [3]:
# Feature table (name, encoding, weight per feature) and the pre-computed
# feature vectors per text; `features_used` is the ordered feature-name list.
features_used_df = pd.read_csv("../resources/more/vectors/vectordist_features.csv", index_col = [0])
meta_all_features = pd.read_csv("../resources/more/vectors/vectordist.csv", index_col = [0])
features_used = features_used_df['feature'].tolist()
In [4]:
# Pre-computed mode scores (strict and flexible variants), keyed by text id.
meta_mode_strikt = pd.read_csv("../resources/more/vectors/mode_strikt.csv", index_col = [0])
meta_mode_flexibel = pd.read_csv("../resources/more/vectors/mode_flexibel.csv", index_col = [0])

Korpora¶

In [5]:
# Anthology corpus: historical poetry from the 'anth' corpus, 1850-1918,
# de-duplicated by author/title.
meta_anth = (
    meta
    .query("corpus == 'anth' and 1850 <= year <= 1918 and geschichtslyrik == 1")
    .drop_duplicates(subset='author_title')
    .reset_index(drop=True)
)
In [6]:
# Modern-canon corpus: historical poetry (1850-1918) by four canonical
# modernist authors, de-duplicated by author/title.
modcanon_authors = ['Hofmannsthal, Hugo von', 'Rilke, Rainer Maria', 'George, Stefan', 'Heym, Georg']

meta_modcanon = (
    meta
    .query("author in @modcanon_authors and 1850 <= year <= 1918 and geschichtslyrik == 1")
    .drop_duplicates(subset='author_title')
    .reset_index(drop=True)
)
In [7]:
# Münchhausen-circle corpus: historical poetry (1850-1918) by the three
# ballad authors, de-duplicated by author/title.
muench_authors = ['Münchhausen, Börries von', 'Miegel, Agnes', 'Strauß und Torney, Lulu von']

meta_muench = (
    meta
    .query("author in @muench_authors and 1850 <= year <= 1918 and geschichtslyrik == 1")
    .drop_duplicates(subset='author_title')
    .reset_index(drop=True)
)
In [8]:
# Combined working corpus: all three sub-corpora, de-duplicated by text id.
meta_all = pd.concat([meta_anth, meta_modcanon, meta_muench])
meta_all = meta_all.drop_duplicates(subset = 'id')
meta_all = meta_all.reset_index(drop = True)

# Membership flags per sub-corpus.  Vectorized `.isin` replaces the original
# per-row `x in list(...)` comprehensions, which re-scanned a Python list for
# every row (O(n*m)); the result (a boolean column) is identical.
meta_all['korpus_anth'] = meta_all['author_title'].isin(meta_anth['author_title'])
meta_all['korpus_modcanon'] = meta_all['author'].isin(modcanon_authors)
meta_all['korpus_muench'] = meta_all['author'].isin(muench_authors)

meta_all.shape[0]
Out[8]:
2063
In [9]:
# Sanity check: all four tables must have the same number of rows before merging.
for frame in (meta_all, meta_all_features, meta_mode_strikt, meta_mode_flexibel):
    print(frame.shape[0])
2063
2063
2063
2063
In [10]:
# Attach feature vectors and both mode scores to the metadata, joined on id.
meta_all = (
    meta_all
    .merge(meta_all_features, on='id')
    .merge(meta_mode_strikt, on='id')
    .merge(meta_mode_flexibel, on='id')
)

meta_all.shape[0]
Out[10]:
2063

Feature-Übersicht¶

In [11]:
# Overview of all features entering the distance computation.
features_used_df
Out[11]:
feature encoding weight encoding_orig
0 vectortyp_geschichtslyrik ordinal 1.00 NaN
1 vectortyp_empirisch bin 1.00 NaN
2 vectortyp_theoretisch bin 1.00 NaN
3 vectortyp_sprechinstanz_markiert bin 1.00 NaN
4 vectortyp_konkretheit ordinal 1.00 NaN
... ... ... ... ...
1137 vectortyp_geschichtsauffassung_bewertung_ambiv... bin 0.20 nominal
1138 vectortyp_verhaeltnis_wissen_ergänzend bin 0.25 nominal_multi
1139 vectortyp_verhaeltnis_wissen_übereinstimmend bin 0.25 nominal_multi
1140 vectortyp_verhaeltnis_wissen_abweichend_überna... bin 0.25 nominal_multi
1141 vectortyp_verhaeltnis_wissen_abweichend_natürlich bin 0.25 nominal_multi

1142 rows × 4 columns

In [12]:
# Spot-check a few raw feature columns.  `random_state` makes the displayed
# sample reproducible under Restart & Run All (the original was unseeded).
meta_all[[
    'vectortyp_geschichtslyrik', 
    'vectortyp_zeitebenen',
    'vectortyp_beginn',
    'vectortyp_entity_simple_1',
    'vectortyp_entity_bewertung_1_1',
    'vectortyp_stoffgebiete_dim_1',
]].sample(n=10, random_state=42)
Out[12]:
vectortyp_geschichtslyrik vectortyp_zeitebenen vectortyp_beginn vectortyp_entity_simple_1 vectortyp_entity_bewertung_1_1 vectortyp_stoffgebiete_dim_1
696 1.0 2.0 1860.0 0 0 -0.853739
107 1.0 1.0 1757.0 1 1 -0.987243
411 1.0 2.0 1631.0 1 0 -0.987243
1607 1.0 2.0 1627.0 1 1 -1.761696
147 1.0 2.0 1851.0 1 1 -1.618742
997 1.0 2.0 1189.0 0 0 -0.987243
1723 1.0 4.0 1229.0 2 0 -1.277816
1003 1.0 1.0 1190.0 1 1 2.394565
2007 1.0 2.0 1150.0 0 0 -1.903517
709 1.0 2.0 1871.0 1 1 -0.987243
In [13]:
# Total feature weight; rounding guards against floating-point accumulation noise.
round(features_used_df['weight'].sum(), 5)
Out[13]:
40.0

Weitere Gewichtung der Features¶

  • weights = Faktor, mit dem die jeweilige Feature-Gruppe gewichtet wird, z. B. 2 = doppelt so stark
In [14]:
# Group-level weighting factors: each feature group's weights are rescaled so
# the group sums to this factor (e.g. 2 = twice as strong).
weights = dict(
    gattung=1,
    stoffgebiete_dim=1,
    beginn=1,
    ende=1,
)
In [15]:
# Re-scale each feature group so its weights sum to the configured factor.
# Features are matched by plain substring on the feature name; `regex=False`
# makes that explicit (the default interprets the key as a regex pattern).
# NOTE(review): substring matching could also catch unrelated features whose
# names merely contain a group key (e.g. 'ende') — verify against the feature list.
for group, target in weights.items():
    this_index = features_used_df['feature'].str.contains(group, regex=False)
    current_values = features_used_df.loc[this_index, 'weight']

    if current_values.sum() != target:
        # Keep a uniform reference weight, then spread the group's target
        # weight evenly over its features.
        features_used_df.loc[this_index, 'weight_orig'] = 1 / this_index.sum()
        features_used_df.loc[this_index, 'weight'] = target / this_index.sum()

Skalierung¶

In [16]:
# Feature scaler; MinMax maps every feature to [0, 1].
# Alternatives kept commented out for experimentation.
# scaler = StandardScaler()
# scaler = RobustScaler()
scaler = MinMaxScaler()
In [17]:
# Scale all feature columns in place; the raw values in meta_all are overwritten.
meta_all[features_used] = scaler.fit_transform(meta_all[features_used])
In [18]:
# Spot-check the same feature columns after scaling.  `random_state` makes
# the displayed sample reproducible under Restart & Run All.
meta_all[[
    'vectortyp_geschichtslyrik', 
    'vectortyp_zeitebenen',
    'vectortyp_beginn',
    'vectortyp_entity_simple_1',
    'vectortyp_entity_bewertung_1_1',
    'vectortyp_stoffgebiete_dim_1',
]].sample(n=10, random_state=42)
Out[18]:
vectortyp_geschichtslyrik vectortyp_zeitebenen vectortyp_beginn vectortyp_entity_simple_1 vectortyp_entity_bewertung_1_1 vectortyp_stoffgebiete_dim_1
1816 0.0 0.0 0.984538 0.00 0.00 0.352816
1887 0.0 0.0 0.651734 0.00 0.00 0.352816
117 0.0 0.2 0.990462 0.00 0.00 0.352816
145 0.0 0.4 0.982225 0.25 0.25 0.384847
1720 0.0 0.4 0.863150 0.50 0.50 0.278450
1662 0.0 0.4 0.919220 0.50 0.25 0.373682
530 0.0 0.0 0.981358 0.25 0.25 0.317260
605 0.0 0.2 0.913006 0.50 0.25 0.401228
1247 0.0 0.2 0.714884 0.25 0.00 0.352816
812 0.0 0.2 0.943642 0.50 0.25 0.805181

Texte mit fehlenden Daten ignorieren¶

In [19]:
# Report every text that has at least one missing feature value.
for _, row in meta_all.iterrows():
    gaps = row[features_used].isnull()
    if gaps.any():
        print(row.id)
        print(row[features_used][gaps.values])
        print("\n")
In [20]:
# Drop texts with missing feature values and renumber the index.
# (dropna already returns a new frame, so no explicit .copy() is needed.)
meta_all = meta_all.dropna(subset=features_used).reset_index(drop=True)
In [21]:
# Sanity check: after dropping, no text should report missing features
# (expected to print nothing).
for _, row in meta_all.iterrows():
    gaps = row[features_used].isnull()
    if gaps.any():
        print(row.id)
        print(row[features_used][gaps.values])
        print("\n")

Zentroid und Abstand zum Zentroid berechnen¶

In [22]:
# Centroid of the anthology corpus: per-feature mean over all 'anth' texts.
centroid = meta_all.query("korpus_anth")[features_used].mean()
In [23]:
# Inspect the centroid vector.
centroid
Out[23]:
vectortyp_geschichtslyrik                                0.000000
vectortyp_empirisch                                      0.997297
vectortyp_theoretisch                                    0.030270
vectortyp_sprechinstanz_markiert                         0.219189
vectortyp_konkretheit                                    0.902162
                                                           ...   
vectortyp_geschichtsauffassung_bewertung_ambivalent      0.001622
vectortyp_verhaeltnis_wissen_ergänzend                   0.747568
vectortyp_verhaeltnis_wissen_übereinstimmend             0.142703
vectortyp_verhaeltnis_wissen_abweichend_übernatürlich    0.107027
vectortyp_verhaeltnis_wissen_abweichend_natürlich        0.003243
Length: 1142, dtype: float64
In [24]:
# Weighted distances of every text to the anthology centroid.
# Vectorized with cdist against the single centroid row instead of a
# per-row Python loop with .at assignments — same values, far fewer calls.
_X = meta_all[features_used].to_numpy()
_c = centroid.to_numpy().reshape(1, -1)
_w = features_used_df['weight'].to_numpy()

meta_all['dist_centroid_manhattan'] = distance.cdist(_X, _c, metric='cityblock', w=_w).ravel()
meta_all['dist_centroid_euclidean'] = distance.cdist(_X, _c, metric='euclidean', w=_w).ravel()
meta_all['dist_centroid_cosine'] = distance.cdist(_X, _c, metric='cosine', w=_w).ravel()
In [25]:
# Combined weighted distance: mean of the three distance measures, each
# min-max scaled to [0, 1] first so they contribute on the same scale.
# Note: each fit_transform call refits `scaler` on that single column.
meta_all['dist_centroid_alldistances'] = (
    scaler.fit_transform(meta_all[['dist_centroid_manhattan']]) +
    scaler.fit_transform(meta_all[['dist_centroid_euclidean']]) +
    scaler.fit_transform(meta_all[['dist_centroid_cosine']])
) / 3
In [26]:
# Unweighted distances of every text to the anthology centroid, vectorized
# with cdist instead of a per-row Python loop (same values).
_X = meta_all[features_used].to_numpy()
_c = centroid.to_numpy().reshape(1, -1)

meta_all['dist_centroid_manhattan_unweighted'] = distance.cdist(_X, _c, metric='cityblock').ravel()
meta_all['dist_centroid_euclidean_unweighted'] = distance.cdist(_X, _c, metric='euclidean').ravel()
meta_all['dist_centroid_cosine_unweighted'] = distance.cdist(_X, _c, metric='cosine').ravel()
In [27]:
# Combined unweighted distance: mean of the three min-max scaled measures.
# Note: each fit_transform call refits `scaler` on that single column.
meta_all['dist_centroid_alldistances_unweighted'] = (
    scaler.fit_transform(meta_all[['dist_centroid_manhattan_unweighted']]) +
    scaler.fit_transform(meta_all[['dist_centroid_euclidean_unweighted']]) +
    scaler.fit_transform(meta_all[['dist_centroid_cosine_unweighted']])
) / 3

Tests¶

In [28]:
# The ten texts closest to the anthology centroid (weighted euclidean).
meta_all.drop_duplicates(subset="author_title")[[
    "author", "title", "year", "dist_centroid_euclidean",
]].sort_values(by = ["dist_centroid_euclidean", "author"], ascending = True).head(10)
Out[28]:
author title year dist_centroid_euclidean
666 Gruppe, Otto Friedrich Karl am Meere 1852.0 1.115235
1630 Müller von Königswinter, Wolfgang Das Zepter Rudolfs von Habsburg 1852.0 1.117486
1987 Münchhausen, Börries von Heerpauken 1914.0 1.120395
1740 Richter, Paul Brusehawer 1908.0 1.125591
1094 Meyer, Conrad Ferdinand Die Schweizer des Herrn von Tremouille 1875.0 1.126415
1488 Schrutz, Demetrius Der Langobardentrunk 1913.0 1.129181
1635 Groth, Klaus Graf Rudolf von Böklenburg 1853.0 1.131610
242 Zille, Moritz Alexander Bonifacius Tod 1851.0 1.133179
292 Brunold, Friedrich König Christian I. von Dänemark und Henning Wulf 1859.0 1.136554
1396 Lahmann, Johann Friedrich Heinrich IV. an der Elster 1890.0 1.137683
In [29]:
# Correlation between mode scores and all centroid-distance variants.
# Distances correlate strongly negatively with the mode scores (see output).
meta_all[[
    'mode_score_strikt', 
    'mode_score_flexibel',
    
    'dist_centroid_manhattan_unweighted', 
    'dist_centroid_euclidean_unweighted', 
    'dist_centroid_cosine_unweighted', 
    'dist_centroid_alldistances_unweighted',
    
    'dist_centroid_manhattan', 
    'dist_centroid_euclidean', 
    'dist_centroid_cosine', 
    'dist_centroid_alldistances',
]].corr()
Out[29]:
mode_score_strikt mode_score_flexibel dist_centroid_manhattan_unweighted dist_centroid_euclidean_unweighted dist_centroid_cosine_unweighted dist_centroid_alldistances_unweighted dist_centroid_manhattan dist_centroid_euclidean dist_centroid_cosine dist_centroid_alldistances
mode_score_strikt 1.000000 0.952823 -0.900802 -0.914699 -0.888176 -0.914241 -0.908581 -0.903217 -0.882121 -0.911031
mode_score_flexibel 0.952823 1.000000 -0.887655 -0.897019 -0.896656 -0.906832 -0.877581 -0.871240 -0.878015 -0.886252
dist_centroid_manhattan_unweighted -0.900802 -0.887655 1.000000 0.979699 0.940538 0.986774 0.880244 0.869071 0.866409 0.883292
dist_centroid_euclidean_unweighted -0.914699 -0.897019 0.979699 1.000000 0.952941 0.991597 0.896486 0.901334 0.874381 0.904018
dist_centroid_cosine_unweighted -0.888176 -0.896656 0.940538 0.952941 1.000000 0.979029 0.860968 0.856909 0.918110 0.885118
dist_centroid_alldistances_unweighted -0.914241 -0.906832 0.986774 0.991597 0.979029 1.000000 0.891894 0.888581 0.899602 0.903857
dist_centroid_manhattan -0.908581 -0.877581 0.880244 0.896486 0.860968 0.891894 1.000000 0.989582 0.947129 0.994697
dist_centroid_euclidean -0.903217 -0.871240 0.869071 0.901334 0.856909 0.888581 0.989582 1.000000 0.938454 0.992968
dist_centroid_cosine -0.882121 -0.878015 0.866409 0.874381 0.918110 0.899602 0.947129 0.938454 1.000000 0.969403
dist_centroid_alldistances -0.911031 -0.886252 0.883292 0.904018 0.885118 0.903857 0.994697 0.992968 0.969403 1.000000
In [30]:
# Box plot: weighted manhattan centroid distance per flexible mode score,
# with hover info for identifying individual texts.
px.box(
    meta_all,
    x = 'mode_score_flexibel',
    y = 'dist_centroid_manhattan',
    points = 'all',
    hover_data = ['id', 'author', 'title',]
)

Distanzen zwischen allen Texten berechnen und Distanzmatrix generieren¶

In [31]:
# Feature matrix (texts x features) and per-feature weights as numpy arrays
# for the pairwise distance computations below.
this_vectors = meta_all[features_used].to_numpy()
this_weights = features_used_df['weight'].values
In [32]:
# Weighted pairwise manhattan distance matrix, labeled by text id.
dm_manhattan = pd.DataFrame(
    distance.cdist(this_vectors, this_vectors, metric='cityblock', w=this_weights),
    index=meta_all['id'].tolist(),
    columns=meta_all['id'].tolist(),
)
In [33]:
# Weighted pairwise euclidean distance matrix, labeled by text id.
dm_euclidean = pd.DataFrame(
    distance.cdist(this_vectors, this_vectors, metric='euclidean', w=this_weights),
    index=meta_all['id'].tolist(),
    columns=meta_all['id'].tolist(),
)
In [34]:
# Weighted pairwise cosine distance matrix, labeled by text id.
dm_cosine = pd.DataFrame(
    distance.cdist(this_vectors, this_vectors, metric='cosine', w=this_weights),
    index=meta_all['id'].tolist(),
    columns=meta_all['id'].tolist(),
)
In [35]:
# Combine the three weighted distance matrices on a common [0, 1] scale.
# Dividing by the global max is equivalent to min-max scaling here because
# every distance matrix has a zero diagonal (minimum is 0).
# The redundant pd.DataFrame wrap and index/column reassignment were removed:
# DataFrame arithmetic already aligns on the identical id labels.
dm_alldistances = (
    dm_manhattan / dm_manhattan.max().max() +
    dm_euclidean / dm_euclidean.max().max() +
    dm_cosine / dm_cosine.max().max()
) / 3
In [36]:
# Unweighted pairwise manhattan distance matrix, labeled by text id.
dm_manhattan_unweighted = pd.DataFrame(
    distance.cdist(this_vectors, this_vectors, metric='cityblock'),
    index=meta_all['id'].tolist(),
    columns=meta_all['id'].tolist(),
)
In [37]:
# Unweighted pairwise euclidean distance matrix, labeled by text id.
dm_euclidean_unweighted = pd.DataFrame(
    distance.cdist(this_vectors, this_vectors, metric='euclidean'),
    index=meta_all['id'].tolist(),
    columns=meta_all['id'].tolist(),
)
In [38]:
# Unweighted pairwise cosine distance matrix, labeled by text id.
dm_cosine_unweighted = pd.DataFrame(
    distance.cdist(this_vectors, this_vectors, metric='cosine'),
    index=meta_all['id'].tolist(),
    columns=meta_all['id'].tolist(),
)
In [39]:
# Combine the three unweighted distance matrices on a common [0, 1] scale.
# Dividing by the global max is equivalent to min-max scaling here because
# every distance matrix has a zero diagonal (minimum is 0).
# The redundant pd.DataFrame wrap and index/column reassignment were removed:
# DataFrame arithmetic already aligns on the identical id labels.
dm_alldistances_unweighted = (
    dm_manhattan_unweighted / dm_manhattan_unweighted.max().max() +
    dm_euclidean_unweighted / dm_euclidean_unweighted.max().max() +
    dm_cosine_unweighted / dm_cosine_unweighted.max().max()
) / 3
In [40]:
# Mean distance of every text to the anthology corpus: column-wise mean over
# the anthology texts' columns of each pairwise distance matrix.  For an
# anthology text this mean includes its own zero self-distance (diagonal).
# Only keep ids still present after the missing-data filter; a dropped id
# would otherwise raise a KeyError on column selection.
meta_anth_ids = [i for i in meta_anth['id'] if i in dm_manhattan.columns]
# NOTE(review): meta_anth_indices is not used in the visible cells — confirm
# downstream use before removing.
meta_anth_indices = meta_all[meta_all['id'].isin(meta_anth_ids)].index

# Weighted variants.
meta_all['dist_mean_manhattan'] = dm_manhattan[meta_anth_ids].mean(axis=1).values
meta_all['dist_mean_euclidean'] = dm_euclidean[meta_anth_ids].mean(axis=1).values
meta_all['dist_mean_cosine'] = dm_cosine[meta_anth_ids].mean(axis=1).values
meta_all['dist_mean_alldistances'] = dm_alldistances[meta_anth_ids].mean(axis=1).values

# Unweighted variants.
meta_all['dist_mean_manhattan_unweighted'] = dm_manhattan_unweighted[meta_anth_ids].mean(axis=1).values
meta_all['dist_mean_euclidean_unweighted'] = dm_euclidean_unweighted[meta_anth_ids].mean(axis=1).values
meta_all['dist_mean_cosine_unweighted'] = dm_cosine_unweighted[meta_anth_ids].mean(axis=1).values
meta_all['dist_mean_alldistances_unweighted'] = dm_alldistances_unweighted[meta_anth_ids].mean(axis=1).values

Tests¶

In [41]:
# The ten texts with the smallest mean weighted euclidean distance to the
# anthology corpus.
meta_all.sort_values(by = "dist_mean_euclidean", ascending = True)[[
    'author', 'title', 'dist_mean_euclidean',
]].head(10)
Out[41]:
author title dist_mean_euclidean
666 Gruppe, Otto Friedrich Karl am Meere 1.998467
1630 Müller von Königswinter, Wolfgang Das Zepter Rudolfs von Habsburg 1.998481
1094 Meyer, Conrad Ferdinand Die Schweizer des Herrn von Tremouille 2.002944
1488 Schrutz, Demetrius Der Langobardentrunk 2.006236
1635 Groth, Klaus Graf Rudolf von Böklenburg 2.006753
292 Brunold, Friedrich König Christian I. von Dänemark und Henning Wulf 2.008788
242 Zille, Moritz Alexander Bonifacius Tod 2.010288
1396 Lahmann, Johann Friedrich Heinrich IV. an der Elster 2.010472
27 Grimm, Herman Die Tochter des Langobardenkönigs 2.010544
1629 Halm, Friedrich Friedrich mit der gebissenen Wange 2.012063
In [42]:
# Full correlation matrix: mode scores vs. all centroid- and mean-distance
# variants (weighted and unweighted).
meta_all[[
    'mode_score_strikt', 'mode_score_flexibel', 
    
    'dist_centroid_manhattan_unweighted', 'dist_centroid_euclidean_unweighted', 
    'dist_centroid_cosine_unweighted', 'dist_centroid_alldistances_unweighted',
    'dist_centroid_manhattan', 'dist_centroid_euclidean', 'dist_centroid_cosine', 'dist_centroid_alldistances',
    
    'dist_mean_manhattan', 'dist_mean_euclidean', 'dist_mean_cosine', 'dist_mean_alldistances',
    'dist_mean_manhattan_unweighted', 'dist_mean_euclidean_unweighted', 'dist_mean_cosine_unweighted', 
    'dist_mean_alldistances_unweighted'
]].corr()
Out[42]:
mode_score_strikt mode_score_flexibel dist_centroid_manhattan_unweighted dist_centroid_euclidean_unweighted dist_centroid_cosine_unweighted dist_centroid_alldistances_unweighted dist_centroid_manhattan dist_centroid_euclidean dist_centroid_cosine dist_centroid_alldistances dist_mean_manhattan dist_mean_euclidean dist_mean_cosine dist_mean_alldistances dist_mean_manhattan_unweighted dist_mean_euclidean_unweighted dist_mean_cosine_unweighted dist_mean_alldistances_unweighted
mode_score_strikt 1.000000 0.952823 -0.900802 -0.914699 -0.888176 -0.914241 -0.908581 -0.903217 -0.882121 -0.911031 -0.908174 -0.906328 -0.887145 -0.911418 -0.904079 -0.919082 -0.891435 -0.915807
mode_score_flexibel 0.952823 1.000000 -0.887655 -0.897019 -0.896656 -0.906832 -0.877581 -0.871240 -0.878015 -0.886252 -0.877557 -0.873846 -0.880373 -0.887011 -0.887584 -0.901716 -0.898782 -0.907626
dist_centroid_manhattan_unweighted -0.900802 -0.887655 1.000000 0.979699 0.940538 0.986774 0.880244 0.869071 0.866409 0.883292 0.881619 0.870721 0.869441 0.884200 0.998595 0.980713 0.942623 0.986269
dist_centroid_euclidean_unweighted -0.914699 -0.897019 0.979699 1.000000 0.952941 0.991597 0.896486 0.901334 0.874381 0.904018 0.899069 0.901863 0.878798 0.903991 0.983311 0.999396 0.955470 0.990811
dist_centroid_cosine_unweighted -0.888176 -0.896656 0.940538 0.952941 1.000000 0.979029 0.860968 0.856909 0.918110 0.885118 0.862470 0.857055 0.916545 0.886616 0.942132 0.954520 0.999925 0.979573
dist_centroid_alldistances_unweighted -0.914241 -0.906832 0.986774 0.991597 0.979029 1.000000 0.891894 0.888581 0.899602 0.903857 0.893752 0.889345 0.901565 0.904659 0.988128 0.992260 0.980548 0.999754
dist_centroid_manhattan -0.908581 -0.877581 0.880244 0.896486 0.860968 0.891894 1.000000 0.989582 0.947129 0.994697 0.998880 0.991083 0.954510 0.994123 0.880928 0.901617 0.865158 0.892813
dist_centroid_euclidean -0.903217 -0.871240 0.869071 0.901334 0.856909 0.888581 0.989582 1.000000 0.938454 0.992968 0.990802 0.999380 0.946257 0.991381 0.871650 0.904848 0.861245 0.888947
dist_centroid_cosine -0.882121 -0.878015 0.866409 0.874381 0.918110 0.899602 0.947129 0.938454 1.000000 0.969403 0.948296 0.938923 0.999565 0.971236 0.866761 0.879664 0.919263 0.901389
dist_centroid_alldistances -0.911031 -0.886252 0.883292 0.904018 0.885118 0.903857 0.994697 0.992968 0.969403 1.000000 0.995098 0.993368 0.975010 0.999639 0.884659 0.908604 0.888628 0.904792
dist_mean_manhattan -0.908174 -0.877557 0.881619 0.899069 0.862470 0.893752 0.998880 0.990802 0.948296 0.995098 1.000000 0.992528 0.955772 0.995407 0.883703 0.904222 0.866736 0.895136
dist_mean_euclidean -0.906328 -0.873846 0.870721 0.901863 0.857055 0.889345 0.991083 0.999380 0.938923 0.993368 0.992528 1.000000 0.946930 0.992440 0.873482 0.906108 0.861505 0.890062
dist_mean_cosine -0.887145 -0.880373 0.869441 0.878798 0.916545 0.901565 0.954510 0.946257 0.999565 0.975010 0.955772 0.946930 1.000000 0.976854 0.870009 0.884124 0.918031 0.903375
dist_mean_alldistances -0.911418 -0.887011 0.884200 0.903991 0.886616 0.904659 0.994123 0.991381 0.971236 0.999639 0.995407 0.992440 0.976854 1.000000 0.886078 0.908950 0.890155 0.905964
dist_mean_manhattan_unweighted -0.904079 -0.887584 0.998595 0.983311 0.942132 0.988128 0.880928 0.871650 0.866761 0.884659 0.883703 0.873482 0.870009 0.886078 1.000000 0.984388 0.944320 0.988446
dist_mean_euclidean_unweighted -0.919082 -0.901716 0.980713 0.999396 0.954520 0.992260 0.901617 0.904848 0.879664 0.908604 0.904222 0.906108 0.884124 0.908950 0.984388 1.000000 0.957110 0.991975
dist_mean_cosine_unweighted -0.891435 -0.898782 0.942623 0.955470 0.999925 0.980548 0.865158 0.861245 0.919263 0.888628 0.866736 0.861505 0.918031 0.890155 0.944320 0.957110 1.000000 0.981126
dist_mean_alldistances_unweighted -0.915807 -0.907626 0.986269 0.990811 0.979573 0.999754 0.892813 0.888947 0.901389 0.904792 0.895136 0.890062 0.903375 0.905964 0.988446 0.991975 0.981126 1.000000

Export¶

In [43]:
# Export the weighted pairwise distance matrices.
dm_manhattan.to_csv("../resources/more/vectors/vectordist_dm_manhattan.csv")
dm_euclidean.to_csv("../resources/more/vectors/vectordist_dm_euclidean.csv")
dm_cosine.to_csv("../resources/more/vectors/vectordist_dm_cosine.csv")
dm_alldistances.to_csv("../resources/more/vectors/vectordist_dm_alldistances.csv")
In [44]:
# Export the unweighted pairwise distance matrices.
dm_manhattan_unweighted.to_csv("../resources/more/vectors/vectordist_dm_manhattan_unweighted.csv")
dm_euclidean_unweighted.to_csv("../resources/more/vectors/vectordist_dm_euclidean_unweighted.csv")
dm_cosine_unweighted.to_csv("../resources/more/vectors/vectordist_dm_cosine_unweighted.csv")
dm_alldistances_unweighted.to_csv("../resources/more/vectors/vectordist_dm_alldistances_unweighted.csv")
In [45]:
# Export per-text distance measures: id plus all centroid/mean distance columns.
export_meta = meta_all[
    ['id'] + [
        'dist_centroid_manhattan', 'dist_centroid_euclidean', 'dist_centroid_cosine', 'dist_centroid_alldistances',
        'dist_centroid_manhattan_unweighted', 'dist_centroid_euclidean_unweighted', 
        'dist_centroid_cosine_unweighted', 'dist_centroid_alldistances_unweighted',
        
        'dist_mean_manhattan', 'dist_mean_euclidean', 'dist_mean_cosine', 'dist_mean_alldistances',
        'dist_mean_manhattan_unweighted', 'dist_mean_euclidean_unweighted', 
        'dist_mean_cosine_unweighted', 'dist_mean_alldistances_unweighted',
    ]
]
# NOTE(review): to_csv writes the integer row index as an extra first column;
# the readers at the top use index_col=[0], so this appears intentional.
export_meta.to_csv("../resources/more/vectors/vectordist_dists.csv")
In [46]:
# Preview the exported table.
export_meta.head()
Out[46]:
id dist_centroid_manhattan dist_centroid_euclidean dist_centroid_cosine dist_centroid_alldistances dist_centroid_manhattan_unweighted dist_centroid_euclidean_unweighted dist_centroid_cosine_unweighted dist_centroid_alldistances_unweighted dist_mean_manhattan dist_mean_euclidean dist_mean_cosine dist_mean_alldistances dist_mean_manhattan_unweighted dist_mean_euclidean_unweighted dist_mean_cosine_unweighted dist_mean_alldistances_unweighted
0 1850.Grube.028 6.046027 1.471513 0.112839 0.147303 22.668557 3.031454 0.180582 0.289637 6.231232 2.267290 0.232499 0.375783 23.948690 4.405008 0.330064 0.527177
1 1850.Kriebitzsch.001 11.063399 2.707721 0.389565 0.713911 32.726389 4.285712 0.364058 0.737184 11.405351 3.231843 0.474081 0.634671 33.671350 5.358220 0.480409 0.708324
2 1850.Kriebitzsch.011 7.459599 1.886688 0.138009 0.282977 23.414010 3.227017 0.182180 0.322968 7.579034 2.569597 0.257942 0.432729 24.557230 4.544311 0.332335 0.538962
3 1850.Kriebitzsch.019 6.586925 1.632293 0.109394 0.190895 24.211065 3.304033 0.186562 0.346566 6.665785 2.376551 0.231034 0.391330 25.070353 4.596473 0.335240 0.546190
4 1851.Müller/Kletke.018 8.133864 2.118448 0.194718 0.382714 27.811599 3.832658 0.243543 0.507471 8.428754 2.756721 0.306317 0.480857 28.988117 5.006438 0.382807 0.614177